The goal of this notebook is to compare label transfer results between:
main at
commit 6af112d. These results are referred to as
"azimuth".3e2f90. These results are referred to as
"adapted_azimuth".2024-08-22
OpenScPCA data release.knitr::opts_chunk$set(message = FALSE, warning = FALSE)
options(future.globals.maxSize = 891289600000000)
suppressPackageStartupMessages({
library(tidyverse)
library(patchwork)
library(Seurat)
})
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")
result_dir <- file.path(module_base, "results")
# functions to perform label transfer with azimuth-adapted approach
source(
file.path(module_base, "notebook_template", "utils", "label-transfer-functions.R")
)
# Output files
full_results_file <- file.path(module_base, "scratch", "compare-label-transfer_fetal-full.rds")
kidney_results_file <- file.path(module_base, "scratch", "compare-label-transfer_fetal-kidney.rds")
# Make a heatmap of counts for label transfer strategies
plot_count_heatmap <- function(df, title, sample_id) {
all_preds <- union(df$azimuth, df$adapted_azimuth)
plotme <- data.frame(
azimuth = all_preds,
adapted_azimuth = all_preds
) |>
expand(azimuth, adapted_azimuth) |>
mutate(n = NA_integer_) |>
anti_join(distinct(df)) |>
bind_rows(
df |> count(azimuth, adapted_azimuth)
) |>
arrange(azimuth) |>
mutate(
color = case_when(
is.na(n) ~ "white",
n <= 20 ~ "grey90",
n <= 50 ~ "lightblue",
n <= 100 ~ "cornflowerblue",
n <= 500 ~ "red",
n <= 1000 ~ "yellow2",
.default = "yellow"
)
)
ggplot(plotme) +
aes(x = azimuth, y = adapted_azimuth, fill = color, label = n) +
geom_tile(alpha = 0.5) +
geom_abline(color = "firebrick", alpha = 0.5) +
geom_text(size = 3.5) +
# scale_fill_viridis_c(name = "count", na.value = "grey90") +
scale_fill_identity() +
theme_bw() +
theme(
axis.text.y = element_text(size = 7),
axis.text.x = element_text(angle = 30, size = 7, hjust = 1),
legend.position = "bottom",
legend.title = element_text(size = 9),
legend.text = element_text(size = 8)
) +
labs(
title = glue::glue("{sample_id}: {str_to_title(title)}")
)
}
# Wrapper function to compare results between approaches
# Makes two plots:
# - heatmap comparing counts for cell labels between approaches
# - density plot of annotation scores for labels that agree and disagree between approaches
compare <- function(df, compare_column, score_column, title) {
spread_df <- df |>
select({{ compare_column }}, barcode, version) |>
pivot_wider(names_from = version, values_from = {{ compare_column }})
heatmap <- plot_count_heatmap(spread_df, title, unique(df$sample_id))
disagree_barcodes <- spread_df |>
filter(azimuth != adapted_azimuth) |>
pull(barcode)
df2 <- df |>
mutate(
agree = ifelse(barcode %in% disagree_barcodes, "labels disagree", "labels agree"),
agree = fct_relevel(agree, "labels disagree", "labels agree")
)
density_plot <- ggplot(df2) +
aes(x = {{ score_column }}, fill = agree) +
geom_density(alpha = 0.6) +
theme_bw() +
ggtitle(
glue::glue("Disagree count: {length(disagree_barcodes)} out of {nrow(spread_df)}")
) +
theme(legend.position = "bottom")
print(heatmap + density_plot + plot_layout(widths = c(2, 1)))
}
This section both:
If results are already available, we read in the files rather than regenerating results.
# sample ids to process
sample_ids <- c("SCPCS000179", "SCPCS000184", "SCPCS000194", "SCPCS000205", "SCPCS000208")
# read in seurat input objects, as needed
if ((!file.exists(full_results_file)) || (!file.exists(kidney_results_file))) {
srat_objects <- sample_ids |>
purrr::map(
\(id) {
srat <- readRDS(
file.path(result_dir, id, glue::glue("01-Seurat_{id}.Rds"))
)
DefaultAssay(srat) <- "RNA"
return(srat)
}
)
names(srat_objects) <- sample_ids
}
if (!file.exists(full_results_file)) {
# read reference
ref <- readRDS(file.path(
module_base,
"results",
"references",
"cao_formatted_ref.rds"
))
full_reference <- ref$reference
full_refdata <- ref$refdata
full_dims <- ref$dims
full_annotation_columns <- c(
glue::glue("predicted.{ref$annotation_levels}"),
glue::glue("predicted.{ref$annotation_levels}.score")
)
# Perform label transfer with new code
assay <- "RNA"
fetal_full <- srat_objects |>
purrr::imap(
\(srat, id) {
set.seed(params$seed)
query <- prepare_query(
srat,
rownames(full_reference),
assay,
file.path(module_base, "scratch", "homologs.rds")
)
query <- transfer_labels(
query,
full_reference,
full_dims,
full_refdata,
query.assay = assay
)
# Read in results from existing Azimuth label transfer code
srat_02a <- readRDS(
file.path(result_dir, id, glue::glue("02a-fetal_full_label-transfer_{id}.Rds"))
)
# create final data frame with all annotations
query@meta.data[, full_annotation_columns] |>
tibble::rownames_to_column(var = "barcode") |>
mutate(
sample_id = id,
version = "adapted_azimuth"
) |>
# existing results
bind_rows(
data.frame(
sample_id = id,
barcode = colnames(srat_02a),
version = "azimuth",
predicted.annotation.l1 = srat_02a$fetal_full_predicted.annotation.l1,
predicted.annotation.l1.score = srat_02a$fetal_full_predicted.annotation.l1.score,
predicted.annotation.l2 = srat_02a$fetal_full_predicted.annotation.l2,
predicted.annotation.l2.score = srat_02a$fetal_full_predicted.annotation.l2.score,
predicted.organ = srat_02a$fetal_full_predicted.organ,
predicted.organ.score = srat_02a$fetal_full_predicted.organ.score
)
)
}
)
write_rds(fetal_full, full_results_file)
} else {
fetal_full <- read_rds(full_results_file)
}
if (!file.exists(kidney_results_file)) {
# read reference
ref <- readRDS(file.path(
module_base,
"results",
"references",
"stewart_formatted_ref.rds"
))
# Pull out information from the reference object we need for label transfer
kidney_reference <- ref$reference
kidney_refdata <- ref$refdata
kidney_dims <- ref$dims
kidney_annotation_columns <- c(
glue::glue("predicted.{ref$annotation_levels}"),
glue::glue("predicted.{ref$annotation_levels}.score")
)
# Perform label transfer with new code
assay <- "RNA"
fetal_kidney <- srat_objects |>
purrr::imap(
\(srat, id) {
set.seed(params$seed)
query <- prepare_query(
srat,
rownames(kidney_reference),
assay,
file.path(module_base, "scratch", "homologs.rds")
)
query <- transfer_labels(
query,
kidney_reference,
kidney_dims,
kidney_refdata,
query.assay = assay
)
# Read in results from existing Azimuth label transfer code
srat_02b <- readRDS(
file.path(result_dir, id, glue::glue("02b-fetal_kidney_label-transfer_{id}.Rds"))
)
# create final data frame with all annotations
query@meta.data[, kidney_annotation_columns] |>
tibble::rownames_to_column(var = "barcode") |>
mutate(
sample_id = id,
version = "adapted_azimuth"
) |>
# existing results
bind_rows(
data.frame(
sample_id = id,
barcode = colnames(srat_02b),
version = "azimuth",
predicted.compartment = srat_02b$fetal_kidney_predicted.compartment,
predicted.compartment.score = srat_02b$fetal_kidney_predicted.compartment.score,
predicted.cell_type = srat_02b$fetal_kidney_predicted.cell_type,
predicted.cell_type.score = srat_02b$fetal_kidney_predicted.cell_type.score
)
)
}
)
write_rds(fetal_kidney, kidney_results_file)
} else {
fetal_kidney <- read_rds(kidney_results_file)
}
We expect: - The majority of annotations match between approaches, with heatmap counts primarily falling along the diagonal - Any annotations that disagree should have low scores
Note that results from the L2 reference are not plotted because they are not used in cell type annotation.
fetal_full |>
purrr::walk(
\(dat) {
compare(dat, predicted.annotation.l1, predicted.annotation.l1.score, "l1")
compare(dat, predicted.organ, predicted.organ.score, "organ")
}
)
fetal_kidney |>
purrr::walk(
\(dat) {
compare(dat, predicted.compartment, predicted.compartment.score, "compartment")
compare(dat, predicted.cell_type, predicted.cell_type.score, "cell_type")
}
)
The vast majority of the time, labels agree. Generally speaking, when labels do not agree, their annotation scores are much lower, which is as expected.
Additional notable differences are shown in tables below:
| Sample | Reference | Count | Azimuth | Azimuth-adapted |
|---|---|---|---|---|
SCPCS000179 |
L1 | 70 | Metanephric cells | Intestinal epithelial cells |
SCPCS000179 |
Organ | 64 | Kidney | Intestine |
SCPCS000179 |
Organ | 20 | Lung | Kidney |
SCPCS000194 |
L1 | 60 | Stromal cells | Mesangial cells |
SCPCS000194 |
Organ | 35 | Kidney | Intestine |
SCPCS000194 |
Organ | 36 | Lung | Kidney |
SCPCS000205 |
Organ | 56 | Kidney | Intestine |
SCPCS000208 |
L1 | 101 | Mesangial cells | Metanephric cells |
SCPCS000208 |
L1 | 75 | Intestinal epithelial cells | Metanephric cells |
SCPCS000208 |
Organ | 149 | Kidney | Intestine |
kidney cell vs podocytekidney epithelial cell vs kidney cellmesenchymal cell vs
mesenchymal stem cell| Sample | Reference | Count | Azimuth | Azimuth-adapted |
|---|---|---|---|---|
SCPCS000179 |
cell type | 94 | mesenchymal cell | kidney epithelial cell |
SCPCS000205 |
compartment | 52 | fetal nephron | stroma |
sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: aarch64-apple-darwin20
Running under: macOS 15.1
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] stats graphics grDevices datasets utils methods base
other attached packages:
[1] Seurat_5.1.0 SeuratObject_5.0.2 sp_2.1-4 patchwork_1.2.0 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
[8] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] deldir_2.0-4 pbapply_1.7-2 gridExtra_2.3 rlang_1.1.4 magrittr_2.0.3 RcppAnnoy_0.0.22
[7] spatstat.geom_3.3-2 matrixStats_1.3.0 ggridges_0.5.6 compiler_4.4.0 reshape2_1.4.4 png_0.1-8
[13] vctrs_0.6.5 pkgconfig_2.0.3 fastmap_1.2.0 labeling_0.4.3 utf8_1.2.4 promises_1.3.0
[19] rmarkdown_2.28 tzdb_0.4.0 xfun_0.47 jsonlite_1.8.8 goftest_1.2-3 later_1.3.2
[25] spatstat.utils_3.1-0 irlba_2.3.5.1 parallel_4.4.0 cluster_2.1.6 R6_2.5.1 ica_1.0-3
[31] spatstat.data_3.1-2 stringi_1.8.4 RColorBrewer_1.1-3 reticulate_1.38.0 spatstat.univar_3.0-0 parallelly_1.38.0
[37] scattermore_1.2 lmtest_0.9-40 Rcpp_1.0.13 knitr_1.48 tensor_1.5 future.apply_1.11.2
[43] zoo_1.8-12 sctransform_0.4.1 httpuv_1.6.15 Matrix_1.7-0 splines_4.4.0 igraph_2.0.3
[49] timechange_0.3.0 tidyselect_1.2.1 abind_1.4-5 rstudioapi_0.16.0 yaml_2.3.10 spatstat.random_3.3-1
[55] spatstat.explore_3.3-2 codetools_0.2-20 miniUI_0.1.1.1 listenv_0.9.1 plyr_1.8.9 lattice_0.22-6
[61] shiny_1.9.1 withr_3.0.1 ROCR_1.0-11 evaluate_0.24.0 Rtsne_0.17 future_1.34.0
[67] fastDummies_1.7.4 survival_3.7-0 polyclip_1.10-7 fitdistrplus_1.2-1 pillar_1.9.0 BiocManager_1.30.25
[73] KernSmooth_2.23-24 renv_1.0.7 plotly_4.10.4 generics_0.1.3 rprojroot_2.0.4 RcppHNSW_0.6.0
[79] hms_1.1.3 munsell_0.5.1 scales_1.3.0 globals_0.16.3 xtable_1.8-4 glue_1.7.0
[85] lazyeval_0.2.2 tools_4.4.0 data.table_1.16.0 RSpectra_0.16-2 RANN_2.6.2 leiden_0.4.3.1
[91] dotCall64_1.1-1 cowplot_1.1.3 grid_4.4.0 colorspace_2.1-1 nlme_3.1-166 cli_3.6.3
[97] spatstat.sparse_3.1-0 spam_2.10-0 fansi_1.0.6 viridisLite_0.4.2 uwot_0.2.2 gtable_0.3.5
[103] digest_0.6.37 progressr_0.14.0 ggrepel_0.9.5 farver_2.1.2 htmlwidgets_1.6.4 htmltools_0.5.8.1
[109] lifecycle_1.0.4 httr_1.4.7 mime_0.12 MASS_7.3-61